library(tidyverse)
library(janitor)
library(palmerpenguins)
library(knitr)
penguins_raw <- read_csv(path_to_file("penguins_raw.csv")) %>%
clean_names()
opts_chunk$set(warning = FALSE, message = FALSE)
#https://www.tidymodels.org/learn/statistics/k-means/
#https://www.guru99.com/r-k-means-clustering.html
#https://afit-r.github.io/kmeans_clustering
library(skimr)
skim (penguins_raw)
Data summary
| Name |
penguins_raw |
| Number of rows |
344 |
| Number of columns |
17 |
| _______________________ |
|
| Column type frequency: |
|
| character |
9 |
| Date |
1 |
| numeric |
7 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| study_name |
0 |
1.00 |
7 |
7 |
0 |
3 |
0 |
| species |
0 |
1.00 |
33 |
41 |
0 |
3 |
0 |
| region |
0 |
1.00 |
6 |
6 |
0 |
1 |
0 |
| island |
0 |
1.00 |
5 |
9 |
0 |
3 |
0 |
| stage |
0 |
1.00 |
18 |
18 |
0 |
1 |
0 |
| individual_id |
0 |
1.00 |
4 |
6 |
0 |
190 |
0 |
| clutch_completion |
0 |
1.00 |
2 |
3 |
0 |
2 |
0 |
| sex |
11 |
0.97 |
4 |
6 |
0 |
2 |
0 |
| comments |
290 |
0.16 |
18 |
68 |
0 |
10 |
0 |
Variable type: Date
| date_egg |
0 |
1 |
2007-11-09 |
2009-12-01 |
2008-11-09 |
50 |
Variable type: numeric
| sample_number |
0 |
1.00 |
63.15 |
40.43 |
1.00 |
29.00 |
58.00 |
95.25 |
152.00 |
▇▇▆▅▃ |
| culmen_length_mm |
2 |
0.99 |
43.92 |
5.46 |
32.10 |
39.23 |
44.45 |
48.50 |
59.60 |
▃▇▇▆▁ |
| culmen_depth_mm |
2 |
0.99 |
17.15 |
1.97 |
13.10 |
15.60 |
17.30 |
18.70 |
21.50 |
▅▅▇▇▂ |
| flipper_length_mm |
2 |
0.99 |
200.92 |
14.06 |
172.00 |
190.00 |
197.00 |
213.00 |
231.00 |
▂▇▃▅▂ |
| body_mass_g |
2 |
0.99 |
4201.75 |
801.95 |
2700.00 |
3550.00 |
4050.00 |
4750.00 |
6300.00 |
▃▇▆▃▂ |
| delta_15_n_o_oo |
14 |
0.96 |
8.73 |
0.55 |
7.63 |
8.30 |
8.65 |
9.17 |
10.03 |
▃▇▆▅▂ |
| delta_13_c_o_oo |
13 |
0.96 |
-25.69 |
0.79 |
-27.02 |
-26.32 |
-25.83 |
-25.06 |
-23.79 |
▆▇▅▅▂ |
penguins <- penguins_raw %>%
rename (
bill_length = culmen_length_mm,
bill_depth = culmen_depth_mm,
flipper_length = flipper_length_mm,
body_mass = body_mass_g
) %>%
mutate (
id = row_number(),
species = word (species, 1),
bill_length = scale(bill_length),
bill_depth = scale(bill_depth),
flipper_length = scale(flipper_length),
body_mass = scale(body_mass)
) %>%
select (id, species, island, sex, bill_length, bill_depth, flipper_length, body_mass) %>%
drop_na
library (GGally)
ggpairs(
data = penguins_raw,
columns = c(10:14),
diag = list(continuous = wrap("barDiag", colour = "blue")),
upper = list(continuous = wrap("cor", size = 6))
)

Visualize potential clusters
library (factoextra)
library(glue)
kmeans_flex <- function (k) {
penguins_kmeans <- kmeans(penguins[5:7], k)
fviz_cluster(penguins_kmeans, geom = "point", data = penguins[5:7]) +
labs(title = glue("{k} clusters")) +
theme (
plot.title = element_text (margin = margin(0,0,5,0), hjust = 0.5, size = 15, family = "Lato"),
plot.background = element_blank(),
legend.text = element_text(hjust = 0, size = 13, family = "Lato")
)
}
map (1:10, kmeans_flex)
## [[1]]

##
## [[2]]

##
## [[3]]

##
## [[4]]

##
## [[5]]

##
## [[6]]

##
## [[7]]

##
## [[8]]

##
## [[9]]

##
## [[10]]

Identify optimal number of clusters.
library(broom)
library(here)
set.seed(920)
penguins_clust <-
tibble (k = 1:10) %>%
mutate (
cluster = map(k, ~kmeans(penguins[5:7], .x, nstart=25)),
glanced = map(cluster, glance)
) %>%
unnest (glanced) %>%
select (-cluster) %>%
clean_names %>%
write_csv (here("2020-07-28","output", "model.csv")) %>%
rename (
total = totss,
intra_cluster = tot_withinss,
inter_cluster = betweenss
) %>%
pivot_longer (
cols = total:inter_cluster,
names_to = "residual_type",
values_to = "residual_sum"
)
Identify optimal number of clusters
ggplot (penguins_clust, aes(x = k, y= residual_sum, fill = residual_type)) +
geom_col (data = penguins_clust %>% filter (residual_type != "total")) +
geom_line (data = penguins_clust %>% filter (residual_type == "intra_cluster"))

fviz_nbclust(penguins[5:7], kmeans, method = "wss") +
ggsave (here("2020-07-28","output", "optimal_k_wss.png"))

fviz_nbclust(penguins[5:7], kmeans, method = "silhouette") +
ggsave (here("2020-07-28","output", "optimal_k_silhouette.png"))

Calculate k-means for optimized 3 clusters
penguins_kmeans <- kmeans(penguins[5:7], 2)
penguins_kmeans_extended <- penguins_kmeans %>%
augment (penguins) %>%
rename (cluster = .cluster) %>%
select (cluster, species, island, sex, id) %>%
write_csv (here("2020-07-28","output", "assignments.csv"))
penguins_kmeans_extended %>%
count(species,cluster) %>%
ggplot(aes(x=species, y=cluster, fill = n)) +
geom_tile() +
ggsave (here("2020-07-28","output", "species_heatmap.png"))

penguins_kmeans_extended %>%
count(island,cluster) %>%
ggplot(aes(x=island, y=cluster, fill = n)) +
geom_tile() +
ggsave (here("2020-07-28","output", "island_heatmap.png"))

penguins_kmeans_extended %>%
count(sex,cluster) %>%
ggplot(aes(x=sex, y=cluster, fill = n)) +
geom_tile() +
ggsave (here("2020-07-28","output", "sex_heatmap.png"))
